library(showtext)
showtext_auto(enable = TRUE)
font_add("自訂字體", "/Users/mac/Documents/in_sync_mac/in_sync_documents/fonts/TaipeiSansTCBeta-Regular.ttf")
library(tidyverse)
library(mgcv)
library(psych)
set.seed(1)
label <- "/Users/mac/Documents/GitHub/GIL/CCKF/dietcoke/data/chunked_freq/20211115-104047-dispersion/"
library(arrow)
fp <- paste0(label, "gam_df.parquet")
if(file.exists(fp)){df <- read_parquet(fp)}
df$text_slice <- as.numeric(df$text_slice)
mid_year columnany(is.na(df$mid_year)) == FALSE
## [1] TRUE
all(!is.na(df$mid_year)) == TRUE
## [1] TRUE
"wb114254" %in% df$urn == FALSE
## [1] TRUE
urn) are included?length(unique(df$urn))
## [1] 63
# Over-dispersed characters from the results of vocabulary growth curve
["子","曰","者","於","為","有","其","人","一","而","以","也","不","之"]
# Subsets of characters from 蔣(2005)
["布;燈", "鐘;錶;磐;篪", "槍;刀;劍;戟;炮", "鯨", "心", "城;池", "快;慢", "籽", "日;山;涉;踄", "大;小;高", "貫;毌;擐;關", "矢;誓", "歌;唱;和"]
# Over-dispersed mid-frequency characters
['避', '聚', '助', '縱', '殊', '假', '戒', '竟', '辨', '委', '屈', '負', '違', '仰', '庫', '畏', '屢', '惜', '逐', '符', '側', '覺', '貫', '稽', '稍']
# Under-dispersed mid-frequency characters
['瑭', '尿', '酳', '皝', '嚐', '緞', '痘', '礼', '楽', '与', '鄩', '爹', '獘', '𣗳', '𧰼', '𫠦', '淂', '縀', '媽', '𡻕', '啊', '捌', '啥', '㭍', '叁']
chars <- unique(df$char)
char_df <- data.frame(char=chars, char_id=1:length(chars))
df <- left_join(df, char_df)
## Joining, by = "char"
kdf <- df %>%
select(mid_year, urn, text_slice) %>%
distinct() %>%
arrange(mid_year, urn, text_slice) %>%
mutate(k=row_number())
dynaspan_levels <- c("先秦", "漢", "魏晉南北", "唐五代十國", "宋元", "明", "清", "民國")
dynaspan_yearto_levels <- c(-221, 220, 589, 1125, 1368, 1644, 1911, 1949)
dynaspan_df <- data.frame(
dynaspan=dynaspan_levels,
dynaspan_yearto=dynaspan_yearto_levels
)
df <- left_join(df, kdf) %>%
left_join(dynaspan_df)
## Joining, by = c("urn", "text_slice", "mid_year")
## Joining, by = "dynaspan"
df$dynaspan <- factor(df$dynaspan, levels=dynaspan_levels)
var(df$raw_freq)
## [1] 5.150873
mean(df$raw_freq)
## [1] 0.9699206
ppois(0, mean(df$raw_freq)) < mean(df$raw_freq == 0)
## [1] TRUE
df %>% filter(raw_freq == 0) %>% nrow() / nrow(df)
## [1] 0.6747718
df$logfreq <- log(df$raw_freq+1)
ggplot(df, aes(raw_freq)) + geom_histogram(binwidth=1)
ggplot(df, aes(logfreq)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
par(mfrow=c(2,1))
plot(density(df$raw_freq))
plot(density(df$logfreq))
par(mfrow=c(2,1))
mid_year_distribution <- df %>%
select(mid_year, urn, text_slice) %>%
distinct() %>%
group_by(mid_year) %>%
count() %>%
arrange(desc(n))
plot(density(mid_year_distribution$mid_year))
abline(v=dynaspan_yearto_levels, col="lightblue")
text(dynaspan_yearto_levels, 0.000025, dynaspan_levels,
cex=0.65, pos=2,col="lightblue")
rug(mid_year_distribution$mid_year, ticksize=-0.1, col="blue") # side=3
mid_year_distribution$mid_year_scaled <- scale(mid_year_distribution$mid_year)
hist(mid_year_distribution$mid_year_scaled)
df %>% select(dynaspan, k, urn) %>% distinct() %>%
group_by(dynaspan) %>%
summarize(`Number of texts`=length(unique(urn)), `Number of text slices`=length(k))
## # A tibble: 8 × 3
## dynaspan `Number of texts` `Number of text slices`
## <fct> <int> <int>
## 1 先秦 1 12
## 2 漢 10 499
## 3 魏晉南北 10 218
## 4 唐五代十國 10 248
## 5 宋元 10 312
## 6 明 10 293
## 7 清 10 178
## 8 民國 2 256
library(readr)
write_csv(df, "gam_df2.csv")
head(df)
## # A tibble: 6 × 14
## dynaspan char raw_freq urn text_slice dzg title mid_year author_norm
## <fct> <chr> <int> <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 先秦 避 1 wb422571 0 道藏/藏… 列子 -412 列御寇
## 2 先秦 啥 0 wb422571 7 道藏/藏… 列子 -412 列御寇
## 3 先秦 㭍 0 wb422571 7 道藏/藏… 列子 -412 列御寇
## 4 先秦 叁 0 wb422571 7 道藏/藏… 列子 -412 列御寇
## 5 先秦 避 0 wb422571 8 道藏/藏… 列子 -412 列御寇
## 6 先秦 聚 4 wb422571 8 道藏/藏… 列子 -412 列御寇
## # … with 5 more variables: __index_level_0__ <int>, char_id <int>, k <int>,
## # dynaspan_yearto <dbl>, logfreq <dbl>
df %>%
keep(is.numeric) %>%
lowerCor()
## rw_fr txt_s md_yr ____0 chr_d k dyns_ lgfrq
## raw_freq 1.00
## text_slice 0.01 1.00
## mid_year 0.03 0.16 1.00
## __index_level_0__ 0.00 0.22 -0.21 1.00
## char_id 0.01 0.00 0.00 0.00 1.00
## k 0.02 0.20 0.99 -0.18 0.00 1.00
## dynaspan_yearto 0.02 0.09 0.96 -0.34 0.00 0.96 1.00
## logfreq 0.86 0.01 0.05 -0.02 0.03 0.04 0.03 1.00
#df %>%
# keep(is.numeric) %>%
# cor()
# culmulative number of text slices
[12, 687, 1868, 4094, 12537, 20026, 45804, 46060]
df %>%
filter(raw_freq>0) %>%
ggplot(aes(k, char_id)) +
geom_point(size=0.5) +
theme_minimal() +
scale_y_continuous(breaks=1:length(unique(df$char)), labels=paste0(1:length(chars), "_", chars)) +
labs(x="Text slice", y="Character")
folder <- paste0(label, "gam_models")
models <- lapply(list.files(folder, full=TRUE), readRDS)
gam_df <- data.frame(
formula=lapply(models, function(x){
formula <- as.character(x$formula)
formula <- paste(formula[[2]], formula[[1]], formula[[3]])
return(formula)
}) %>% unlist(),
AIC=lapply(models, AIC) %>% unlist(),
BIC=lapply(models, BIC) %>% unlist()
) %>%
mutate(model_id=row_number()) %>%
arrange(AIC, formula) %>%
select(model_id, AIC, BIC, formula); gam_df
## model_id AIC BIC
## 1 1 204915.3 208330.1
## 2 3 209323.2 215949.7
## 3 2 213227.2 219325.5
## formula
## 1 raw_freq ~ s(k, by = char, id = 1, m = 1) + s(char, bs = "re")
## 2 raw_freq ~ s(k, char, bs = "fs", m = 1, k = 20) + s(author_norm, bs = "re")
## 3 raw_freq ~ s(k, char, bs = "fs", m = 1, k = 20)
coef_mats <- lapply(models, function(x){
coefs <- coef(x)
coefs <- coefs[2:length(coefs)]
coef_mat <- matrix(coefs, nrow=length(chars))
return(coef_mat)
})
## Warning in matrix(coefs, nrow = length(chars)): data length [1056] is not a sub-
## multiple or multiple of the number of rows [50]
coef_dfs <- lapply(coef_mats, function(x){
coef_df <- data.frame(x)
rownames(coef_df) <- paste0(1:length(chars), "_", chars)
return(coef_df)
})
model <- models[[1]]
model$formula
## raw_freq ~ s(k, by = char, id = 1, m = 1) + s(char, bs = "re")
print(summary(model$gam))
## Length Class Mode
## 0 NULL NULL
coef_mat <- coef_mats[[1]]
coef_df <- coef_dfs[[1]]
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
res.dist <- get_dist(coef_df, stand=FALSE, method = "euclidean")
fviz_dist(res.dist,
gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
for(method in c("silhouette", "wss", "gap_stat")){
print(fviz_nbclust(coef_mat, kmeans, method=method))
}
km.res <- kmeans(coef_df, 2, nstart = 25)
fviz_cluster(km.res, data = coef_df,
palette = "jco", ggtheme = theme_minimal(), geom="text")
library(cluster)
pam.res <- pam(coef_df, 2)
fviz_cluster(pam.res,
palette = "jco", ggtheme = theme_minimal(), geom="text")
res.hc <- hclust(dist(coef_df), method="ward.D2")
fviz_dend(res.hc, k = 2,
cex = 0.5,
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
color_labels_by_k = TRUE,
rect = TRUE
)
## Warning in get_col(col, k): Length of color vector was longer than the number of
## clusters - first k elements are used
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
res.hc <- coef_df %>%
eclust("hclust", k = 20, graph = FALSE)
fviz_dend(res.hc,
cex = 0.5, palette = "jco", rect = TRUE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
fviz_silhouette(res.hc)
## cluster size ave.sil.width
## 1 1 2 0.32
## 2 2 2 0.44
## 3 3 1 0.00
## 4 4 10 0.56
## 5 5 4 0.36
## 6 6 7 0.64
## 7 7 3 0.61
## 8 8 1 0.00
## 9 9 3 0.43
## 10 10 1 0.00
## 11 11 1 0.00
## 12 12 1 0.00
## 13 13 3 0.10
## 14 14 2 0.71
## 15 15 1 0.00
## 16 16 1 0.00
## 17 17 2 0.38
## 18 18 3 0.36
## 19 19 1 0.00
## 20 20 1 0.00
# Silhouette width of observations
sil <- res.hc$silinfo$widths[, 1:3]
# Objects with negative silhouette
neg_sil_index <- which(sil[, 'sil_width'] < 0)
sil[neg_sil_index, , drop = FALSE]
## cluster neighbor sil_width
## 44_側 13 4 -0.09563395
res.fanny <- fanny(coef_df, k=2)
## Warning in fanny(coef_df, k = 2): FANNY algorithm has not converged in 'maxit' =
## 500 iterations
## Warning in fanny(coef_df, k = 2): the memberships are all very close to 1/k.
## Maybe decrease 'memb.exp' ?
fviz_cluster(res.fanny,
palette = "jco", ggtheme = theme_minimal())
fviz_silhouette(res.fanny,
palette = "jco", ggtheme = theme_minimal())
## cluster size ave.sil.width
## 1 1 25 0.44
## 2 2 25 0.09
library(fpc)
db <- fpc::dbscan(coef_df, eps = 5, MinPts = 2)
fviz_cluster(db, data = coef_df, stand = FALSE,
geom = "point", palette = "jco", ggtheme = theme_classic())
fviz_cluster(db, data = coef_df, stand = FALSE,
geom = "text", palette = "jco", ggtheme = theme_classic())
dbscan::kNNdistplot(coef_df, k = 5)
gradient.color <- list(low = "steelblue", high = "white")
coef_mat %>%
get_clust_tendency(n = 2, gradient = gradient.color)
## $hopkins_stat
## [1] 0.8442724
##
## $plot
Source: https://www.datanovia.com/en/blog/types-of-clustering-methods-overview-and-quick-start-r-code/
library(lattice)
df <- df %>%
separate(dzg, into=c("dzg_lev1", "dzg_lev2"), sep="/", remove=FALSE) %>%
mutate(char_label=paste0(char_id, "_", char), char=char_id)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 22500 rows [3113,
## 3114, 3115, 3116, 3117, 3118, 3119, 3120, 3121, 3122, 3123, 3124, 3125, 3126,
## 3127, 3128, 3129, 3130, 3131, 3132, ...].
df$char <- factor(df$char)
df$author_norm <- factor(df$author_norm)
df$dzg <- factor(df$dzg)
df$dzg_lev1 <- factor(df$dzg_lev1)
df$fit <- predict.gam(model, df)
new_df <- df %>%
group_by(char, char_id) %>%
summarize(fano=var(logfreq, na.rm=TRUE)/mean(logfreq, na.rm=TRUE))
## `summarise()` has grouped output by 'char'. You can override using the `.groups` argument.
new_df %>% arrange(desc(fano))
## # A tibble: 50 × 3
## # Groups: char [50]
## char char_id fano
## <fct> <int> <dbl>
## 1 7 7 3.89
## 2 23 23 3.33
## 3 25 25 2.33
## 4 30 30 2.31
## 5 24 24 2.13
## 6 38 38 1.97
## 7 22 22 1.92
## 8 50 50 1.75
## 9 31 31 1.48
## 10 17 17 1.39
## # … with 40 more rows
fano_quantiles <- quantile(new_df$fano, probs=c(0.1, 0.9), na.rm=TRUE)
indices_low <- new_df %>% filter(fano < fano_quantiles[1]) %>% pull(char_id)
indices_high <- new_df %>% filter(fano > fano_quantiles[2]) %>% pull(char_id)
par(mfrow=c(2,3))
for(i in indices_low){
plot(model, select=i, main=chars[i])
}
par(mfrow=c(2,3))
for(i in indices_high){
plot(model, select=i, main=chars[i])
}
options(plot.repr.height=12, plot.repr.width=12)
xyplot(logfreq ~ k|char_label, data=df, type="l")
bursty_df <- df %>%
group_by(char, char_id) %>%
summarize(burstiness=(logfreq/sum(logfreq, na.rm=TRUE))-(1/n())) %>%
mutate(is_bursty_period=burstiness > 0) %>%
arrange(desc(burstiness)) %>%
group_by(char, char_id) %>%
summarize(burstiness_median=median(burstiness)) %>%
arrange(desc(burstiness_median))
## `summarise()` has grouped output by 'char', 'char_id'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'char'. You can override using the `.groups` argument.
bursty_chars <- bursty_df %>% select(char_id) %>% distinct() %>% head(6) %>% pull(char_id)
## Adding missing grouping variables: `char`
par(mfrow=c(2,3))
for(i in bursty_chars){
plot(model, select=i, main=chars[i])
}
set.seed(1)
author_profile <- read_csv("/Users/mac/Documents/GitHub/GIL/CCKF/dietcoke/data/author_time/author_profile.csv") %>%
filter(!is.na(mid_year)) %>%
mutate(mid_year_grp=mid_year %/% 100) %>%
group_by(mid_year_grp) %>%
slice_sample(n=3)
## Warning: Missing column names filled in: 'X1' [1]
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_double(),
## name = col_character(),
## birth_year = col_double(),
## mort_year = col_double(),
## mid_year = col_double(),
## age = col_double(),
## urn = col_character(),
## title = col_character(),
## dynaspan = col_character(),
## year_source = col_character()
## )
par(mfrow=c(2,1))
hist(author_profile$mid_year)
plot(density(author_profile$mid_year))
abline(v=dynaspan_yearto_levels, col="lightblue")
text(dynaspan_yearto_levels, 0.000025, dynaspan_levels,
cex=0.65, pos=2,col="lightblue")
rug(author_profile$mid_year, ticksize=-0.1, col="blue")
df %>% select(dynaspan, k, urn) %>% distinct() %>%
group_by(dynaspan) %>%
summarize(`Number of texts`=length(unique(urn)), `Number of text slices`=length(k))
## # A tibble: 8 × 3
## dynaspan `Number of texts` `Number of text slices`
## <fct> <int> <int>
## 1 先秦 1 12
## 2 漢 10 499
## 3 魏晉南北 10 218
## 4 唐五代十國 10 248
## 5 宋元 10 312
## 6 明 10 293
## 7 清 10 178
## 8 民國 2 256